import random
import pandas as pd
import numpy as np
import copy

def pick_random_protected_group(df, cols):
    """Pick one (column, value) pair at random from the values seen in ``cols``.

    Every distinct value of every listed column contributes exactly one
    candidate pair, and ``random.choice`` selects among those candidates.

    Args:
        df: DataFrame holding the candidate columns.
        cols: List of column names to draw from.

    Returns:
        A ``(column_name, value)`` tuple.
    """
    candidates = df[cols]
    pairs = [
        (column, value)
        for column in candidates.columns
        for value in candidates[column].unique()
    ]
    return random.choice(pairs)

def singular_row_dot_product(row, coefficients,sigma_true, starting_sum = 0):
    """Return the noisy linear combination of one row with ``coefficients``.

    One Gaussian noise term (mean 0, standard deviation ``sigma_true``) is
    drawn, then ``coefficients[k] * row[k]`` is accumulated for every
    position ``k`` of ``row``.

    Args:
        row: Iterable of numeric values (for example one DataFrame row).
        coefficients: Sequence indexed by position; must be at least as long
            as ``row`` or an ``IndexError`` is raised.
        sigma_true: Standard deviation of the additive noise.
        starting_sum: Offset the total starts from. It used to be accepted
            but ignored; the default of 0 keeps existing results unchanged.

    Returns:
        A two-element list ``[total, noise]``.
    """
    # Same draw as singular_row_gaussian_draw(0, 0, sigma_true).
    noise = np.random.normal(loc = 0, scale = sigma_true, size = 1)[0]
    total = starting_sum + noise
    for position, value in enumerate(row):
        total = total + (coefficients[position] * value)
    return [total, noise]

def produce_true_log_odds(df, cols, protected_group_ind, _Delta, sigma_true, map_coefficients = None, low = -1, high = 1):
    """Simulate per-row log-odds from a noisy linear model over one-hot features.

    ``df[cols]`` is one-hot encoded, every encoded column gets a coefficient,
    and each row's log-odds is its noisy dot product with those coefficients.
    Rows whose indicator equals 1 are then shifted by ``+_Delta`` and rows
    whose indicator equals 0 by ``-_Delta``.

    Args:
        df: Input DataFrame; it is copied and never modified.
        cols: Columns used as model features.
        protected_group_ind: Name of the 0/1 indicator column in ``df``.
        _Delta: Shift applied by group membership.
        sigma_true: Standard deviation of the per-row noise.
        map_coefficients: Mapping from encoded column name to coefficient.
            When empty (or omitted) coefficients are drawn uniformly from
            ``[low, high]`` and stored in it; otherwise it is looked up.
        low: Lower bound for drawn coefficients.
        high: Upper bound for drawn coefficients.

    Returns:
        ``(log_odds, map_coefficients, rowwise_noise)`` where the first and
        last items are Series.

    Raises:
        KeyError: If a supplied non-empty ``map_coefficients`` lacks an
            encoded column name.
    """
    # A literal ``{}`` default is created once and shared across calls; since
    # this function fills the mapping in place, later calls would silently
    # reuse the first call's coefficients.
    if map_coefficients is None:
        map_coefficients = {}

    df = df.copy(deep = True)
    ind_col = df[protected_group_ind]
    # Remove the indicator from the working copy before selecting features.
    del df[protected_group_ind]
    df_dummy = pd.get_dummies(df[cols], drop_first = False)
    feature_names = list(df_dummy)

    print(len(map_coefficients.keys()))
    if len(map_coefficients) == 0:
        coefficients = []
        for name in feature_names:
            r = random.uniform(low, high)
            coefficients.append(r)
            map_coefficients[name] = r
    else:
        coefficients = [map_coefficients[name] for name in feature_names]

    info = df_dummy.apply(singular_row_dot_product, axis = 1, coefficients = coefficients, sigma_true = sigma_true, starting_sum = 0)
    df_dummy = pd.concat([df_dummy, ind_col], axis = 1)
    df_dummy["log_odds"] = [x[0] for x in info]
    df_dummy["rowwise_noise"] = [x[1] for x in info]

    in_group = df_dummy[protected_group_ind] == 1
    out_group = df_dummy[protected_group_ind] == 0
    df_dummy.loc[in_group, "log_odds"] = df_dummy.loc[in_group, "log_odds"] + _Delta
    df_dummy.loc[out_group, "log_odds"] = df_dummy.loc[out_group, "log_odds"] - _Delta
    return df_dummy["log_odds"], map_coefficients, df_dummy["rowwise_noise"]

def produce_true_log_odds_gaussian(df, cols, protected_group_ind, _Delta, sigma_true, map_coefficients = None, sigma_coef = .2):
    """Simulate per-row log-odds with normally distributed coefficients.

    ``df[cols]`` is one-hot encoded, every encoded column gets a coefficient,
    and each row's log-odds is its noisy dot product with those coefficients.
    Rows whose indicator equals 1 are then shifted by ``+_Delta`` and rows
    whose indicator equals 0 by ``-_Delta``.

    Args:
        df: Input DataFrame; it is copied and never modified.
        cols: Columns used as model features.
        protected_group_ind: Name of the 0/1 indicator column in ``df``.
        _Delta: Shift applied by group membership.
        sigma_true: Standard deviation of the per-row noise.
        map_coefficients: Mapping from encoded column name to coefficient.
            When empty (or omitted) coefficients are drawn from a normal
            distribution with mean 0 and standard deviation ``sigma_coef``
            and stored in it; otherwise it is looked up.
        sigma_coef: Standard deviation used when drawing coefficients.

    Returns:
        ``(log_odds, map_coefficients, rowwise_noise)`` where the first and
        last items are Series.

    Raises:
        KeyError: If a supplied non-empty ``map_coefficients`` lacks an
            encoded column name.
    """
    # A literal ``{}`` default is created once and shared across calls; since
    # this function fills the mapping in place, later calls would silently
    # reuse the first call's coefficients.
    if map_coefficients is None:
        map_coefficients = {}

    df = df.copy(deep = True)
    ind_col = df[protected_group_ind]
    # Remove the indicator from the working copy before selecting features.
    del df[protected_group_ind]
    df_dummy = pd.get_dummies(df[cols], drop_first = False)
    feature_names = list(df_dummy)

    print(len(map_coefficients.keys()))
    if len(map_coefficients) == 0:
        coefficients = []
        for name in feature_names:
            r = np.random.normal(0, sigma_coef, 1)[0]
            coefficients.append(r)
            map_coefficients[name] = r
    else:
        coefficients = [map_coefficients[name] for name in feature_names]

    info = df_dummy.apply(singular_row_dot_product, axis = 1, coefficients = coefficients, sigma_true = sigma_true, starting_sum = 0)
    df_dummy = pd.concat([df_dummy, ind_col], axis = 1)
    df_dummy["log_odds"] = [x[0] for x in info]
    df_dummy["rowwise_noise"] = [x[1] for x in info]

    in_group = df_dummy[protected_group_ind] == 1
    out_group = df_dummy[protected_group_ind] == 0
    df_dummy.loc[in_group, "log_odds"] = df_dummy.loc[in_group, "log_odds"] + _Delta
    df_dummy.loc[out_group, "log_odds"] = df_dummy.loc[out_group, "log_odds"] - _Delta
    return df_dummy["log_odds"], map_coefficients, df_dummy["rowwise_noise"]

def select_subgroup_for_bias(num_covars, value_prob, df, cols, group_ind):
    """Randomly define a subgroup as a set of allowed values per covariate.

    ``num_covars`` columns are sampled from ``cols``. For each one, every
    distinct value found among the rows where ``df[group_ind] == 1`` is kept
    with probability ``value_prob``; if none is kept, one value is picked at
    random so that each chosen covariate has at least one allowed value.

    Args:
        num_covars: Number of covariates to sample from ``cols``.
        value_prob: Inclusion probability for each distinct value.
        df: DataFrame containing ``cols`` and ``group_ind``.
        cols: Candidate covariate column names.
        group_ind: Name of the indicator column; only rows equal to 1 are used.

    Returns:
        Dict mapping each chosen covariate to its list of allowed values.
    """
    chosen_covars = random.sample(cols, num_covars)
    protected_rows = df[(df[group_ind] == 1)]
    subgroup = {}
    for covar in chosen_covars:
        candidates = list(protected_rows[covar].unique())
        flags = np.random.binomial(size = len(candidates), n = 1, p = value_prob)
        kept = [value for value, flag in zip(candidates, flags) if flag == 1]
        if not kept:
            # Never leave a covariate without any allowed value.
            kept.append(random.sample(candidates, 1)[0])
        subgroup[covar] = kept
    return subgroup

def add_bias_shifted_log_odds_choose_new_outcomes(df, s, _delta, group_ind):
    """Shift the true log-odds of a protected subgroup and redraw outcomes.

    Rows that match every entry of ``s`` and have ``df[group_ind] == 1`` get
    ``_delta`` added to ``"true_log_odds"``. ``"true_probs"`` is then
    recomputed and ``"ReoffendedWithinTwoYears"`` is redrawn for all rows.
    Nothing happens when ``s`` is empty. ``df`` is modified in place.

    Args:
        df: DataFrame with ``"true_log_odds"``, ``group_ind`` and the
            columns named in ``s``.
        s: Dict mapping column name to the list of values in the subgroup.
        _delta: Amount added to the log-odds of the affected rows.
        group_ind: Name of the 0/1 indicator column.
    """
    if len(s.keys()) == 0:
        return
    in_subgroup = df[s.keys()].isin(s).all(axis=1)
    affected = in_subgroup & (df[group_ind] == 1)
    df.loc[affected, "true_log_odds"] = (df.loc[affected, "true_log_odds"] + _delta)
    log_odds_to_prob(df, "true_log_odds", "true_probs")
    df["ReoffendedWithinTwoYears"] = df["true_probs"].apply(singular_row_bernoulli_draw)

def add_bias_shifted_log_odds_non_move_choose_new_outcomes(df, s, _delta, group_ind):
    """Write shifted log-odds to a separate column and redraw outcomes.

    ``"true_log_odds"`` is left untouched. A ``"true_log_odds_modified"``
    column is filled with ``true_log_odds + _delta`` for subgroup rows whose
    indicator is 1 and ``true_log_odds - _delta`` for subgroup rows whose
    indicator is 0. ``"true_probs_modified"`` is derived from it and
    ``"ReoffendedWithinTwoYears"`` is redrawn. ``df`` is modified in place.

    Args:
        df: DataFrame with ``"true_log_odds"``, ``group_ind`` and the
            columns named in ``s``.
        s: Dict mapping column name to the list of values in the subgroup.
        _delta: Size of the shift.
        group_ind: Name of the 0/1 indicator column.
    """
    to_choose = df[s.keys()].isin(s).all(axis=1)
    shifted_up = to_choose & (df[group_ind] == 1)
    shifted_down = to_choose & (df[group_ind] == 0)

    # Create the column as float: assigning float log-odds into an integer
    # column relies on silent upcasting, which recent pandas deprecates.
    # NOTE(review): rows outside the subgroup keep a modified log-odds of 0
    # (probability 0.5) instead of their true_log_odds - confirm intended.
    df["true_log_odds_modified"] = 0.0
    df.loc[shifted_up, "true_log_odds_modified"] = (df.loc[shifted_up, "true_log_odds"] + _delta)
    df.loc[shifted_down, "true_log_odds_modified"] = (df.loc[shifted_down, "true_log_odds"] - _delta)
    log_odds_to_prob(df, "true_log_odds_modified", "true_probs_modified")
    df["ReoffendedWithinTwoYears"] = df["true_probs_modified"].apply(singular_row_bernoulli_draw)

def add_bias_shifted_log_odds_by_group_ind_choose_new_outcomes(df, _Delta, group_ind):
    """Shift true log-odds by group membership and redraw outcomes.

    Rows with ``df[group_ind] == 1`` get ``+_Delta`` and rows with
    ``df[group_ind] == 0`` get ``-_Delta`` on ``"true_log_odds"``.
    ``"true_probs"`` is recomputed and ``"ReoffendedWithinTwoYears"`` is
    redrawn. ``df`` is modified in place.

    Args:
        df: DataFrame with ``"true_log_odds"`` and ``group_ind``.
        _Delta: Size of the shift.
        group_ind: Name of the 0/1 indicator column.
    """
    members = (df[group_ind] == 1)
    non_members = (df[group_ind] == 0)
    df.loc[members, "true_log_odds"] = df.loc[members, "true_log_odds"] + _Delta
    df.loc[non_members, "true_log_odds"] = df.loc[non_members, "true_log_odds"] - _Delta
    log_odds_to_prob(df, "true_log_odds", "true_probs")
    df["ReoffendedWithinTwoYears"] = df["true_probs"].apply(singular_row_bernoulli_draw)

def singular_row_gaussian_draw(x,mu,sd):
    """Draw one sample from a normal distribution centred at ``x + mu``.

    Args:
        x: Base value.
        mu: Offset added to ``x`` to form the mean.
        sd: Standard deviation of the draw.

    Returns:
        A single sampled value.
    """
    centre = x + mu
    sample = np.random.normal(loc = centre, scale = sd, size = 1)
    return sample[0]

def singular_row_bernoulli_draw(x):
    """Draw a single 0/1 outcome that is 1 with probability ``x``.

    Args:
        x: Success probability passed to ``np.random.binomial``.

    Returns:
        The sampled outcome, 0 or 1.
    """
    outcome = np.random.binomial(n = 1, p = x, size = 1)
    return outcome[0]

def add_bias_shifted_by_mu(df,s, mu, sigma, group_ind):
    """Create noisy predicted log-odds, with a mean shift for one subgroup.

    ``"predicted_log_odds"`` is set for every row to a Gaussian draw centred
    on that row's ``"true_log_odds"`` with standard deviation ``sigma``.
    When ``s`` is non-empty, rows that match ``s`` and have
    ``df[group_ind] == 1`` are redrawn with their mean shifted by ``mu``.
    ``df`` is modified in place.

    Args:
        df: DataFrame with ``"true_log_odds"``, ``group_ind`` and the
            columns named in ``s``.
        s: Dict mapping column name to the list of values in the subgroup.
        mu: Mean shift applied to the subgroup's predictions.
        sigma: Standard deviation of the prediction noise.
        group_ind: Name of the 0/1 indicator column.
    """
    df["predicted_log_odds"] = df["true_log_odds"].apply(singular_row_gaussian_draw, mu = 0, sd = sigma)
    if len(s.keys()) == 0:
        return
    in_subgroup = df[s.keys()].isin(s).all(axis=1)
    biased_rows = in_subgroup & (df[group_ind] == 1)
    df.loc[biased_rows, "predicted_log_odds"] = df.loc[biased_rows, "true_log_odds"].apply(singular_row_gaussian_draw, mu = mu, sd = sigma)
        
    
    
    
def singular_row_log_odds_prob(x):
    """Convert log-odds to a probability with the logistic function.

    Computes ``exp(x) / (1 + exp(x))`` in a numerically stable way. The
    direct formula overflows for large positive ``x`` (``exp(x)`` becomes
    ``inf`` and the ratio ``nan``); this form saturates at 1.0 instead.

    Args:
        x: Log-odds, as a scalar or a NumPy array.

    Returns:
        The corresponding probability (or array of probabilities).
    """
    # sigmoid(x) = 1 / (1 + exp(-x)) = exp(-log(1 + exp(-x)));
    # logaddexp evaluates log(exp(0) + exp(-x)) without overflowing.
    return np.exp(-np.logaddexp(0, -x))

def log_odds_to_prob(df, log_odds_col, new_probs_col_name):
    """Add a probability column derived from a log-odds column.

    Each value of ``df[log_odds_col]`` is passed through
    ``singular_row_log_odds_prob`` and the result is stored in
    ``df[new_probs_col_name]``. ``df`` is modified in place.

    Args:
        df: DataFrame to update.
        log_odds_col: Name of the existing log-odds column.
        new_probs_col_name: Name of the column to create or overwrite.
    """
    probabilities = df[log_odds_col].apply(singular_row_log_odds_prob)
    df[new_probs_col_name] = probabilities
    
def compute_accuracy(df, s_bias, s_found, group_ind):
    """Return the overlap (intersection over union) of two subgroups.

    Each subgroup is the set of rows with ``df[group_ind] == 1`` that match
    every entry of its dict; an empty dict selects all such rows. A
    positional ``"i"`` column is written to ``df`` as a side effect.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in the dicts.
        s_bias: Dict describing the subgroup that was biased.
        s_found: Dict describing the subgroup that was detected.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        ``len(intersection) / len(union)`` as a float.

    Raises:
        ZeroDivisionError: If both subgroups are empty.
    """
    df["i"] = list(range(0, len(df)))
    in_group = (df[group_ind] == 1)

    def member_ids(subgroup):
        # An empty definition stands for every row of the protected group.
        if len(subgroup.keys()) == 0:
            return set(df.loc[in_group, "i"])
        matches = df[subgroup.keys()].isin(subgroup).all(axis=1)
        return set(df.loc[matches & in_group, "i"])

    bias_ids = member_ids(s_bias)
    found_ids = member_ids(s_found)
    shared = bias_ids & found_ids
    combined = bias_ids | found_ids
    return float(len(shared)) / float(len(combined))

def compute_recall(df, s_bias, s_found, group_ind):
    """Return the share of the biased subgroup that the detected one covers.

    Each subgroup is the set of rows with ``df[group_ind] == 1`` that match
    every entry of its dict; an empty dict selects all such rows. A
    positional ``"i"`` column is written to ``df`` as a side effect.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in the dicts.
        s_bias: Dict describing the subgroup that was biased.
        s_found: Dict describing the subgroup that was detected.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        ``len(intersection) / len(biased subgroup)`` as a float.

    Raises:
        ZeroDivisionError: If the biased subgroup has no rows.
    """
    df["i"] = list(range(0, len(df)))

    id_sets = []
    for subgroup in (s_bias, s_found):
        selected = (df[group_ind] == 1)
        if len(subgroup.keys()) != 0:
            selected = df[subgroup.keys()].isin(subgroup).all(axis=1) & selected
        id_sets.append(set(df.loc[selected, "i"]))

    bias_ids, found_ids = id_sets
    overlap = bias_ids.intersection(found_ids)
    return float(len(overlap)) / float(len(bias_ids))

def compute_precision(df, s_bias, s_found, group_ind):
    """Return the share of the detected subgroup that was actually biased.

    Each subgroup is the set of rows with ``df[group_ind] == 1`` that match
    every entry of its dict; an empty dict selects all such rows. A
    positional ``"i"`` column is written to ``df`` as a side effect.

    The earlier implementation returned ``len(biased) / len(union)``, which
    is not precision; the result is now ``len(intersection) / len(found)``.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in the dicts.
        s_bias: Dict describing the subgroup that was biased.
        s_found: Dict describing the subgroup that was detected.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        The precision as a float, or ``0.0`` when the detected subgroup has
        no rows (nothing detected means nothing detected correctly).
    """
    df["i"] = list(range(0, len(df)))
    in_group = (df[group_ind] == 1)

    def _row_ids(subgroup):
        # An empty definition stands for every row of the protected group.
        if len(subgroup) == 0:
            return set(df.loc[in_group, "i"])
        matches = df[list(subgroup)].isin(subgroup).all(axis=1)
        return set(df.loc[matches & in_group, "i"])

    bias_ids = _row_ids(s_bias)
    found_ids = _row_ids(s_found)
    if not found_ids:
        # Avoid dividing by zero when no row was detected.
        return 0.0
    return float(len(bias_ids & found_ids)) / float(len(found_ids))
    

def create_attributes_data_all_protected(df, label_col):
    """Build a one-row frame flagging every column as 1 and the label as 2.

    Args:
        df: DataFrame whose column names become the columns of the result.
        label_col: Name of the label column; its flag is set to 2 (the
            column is added if ``df`` does not have it).

    Returns:
        A single-row DataFrame of flags.
    """
    flags = dict.fromkeys(list(df), 1)
    flags[label_col] = 2
    return pd.DataFrame([flags])

def compute_accuracy_gerryfair(df, s_id_gerryfair, s_bias, group_ind):
    """Return intersection over union of the biased and the flagged rows.

    The biased subgroup is the set of rows with ``df[group_ind] == 1`` that
    match every entry of ``s_bias`` (all such rows when ``s_bias`` is
    empty). The flagged subgroup is the set of rows with
    ``df[group_ind] == 1`` whose ``s_id_gerryfair`` entry equals 1. The
    columns ``"gerry_violated_group"`` and ``"i"`` are written to ``df``.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in ``s_bias``.
        s_id_gerryfair: Per-row flags, stored as ``"gerry_violated_group"``.
        s_bias: Dict describing the subgroup that was biased.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        ``len(intersection) / len(union)`` as a float.

    Raises:
        ZeroDivisionError: If both subgroups are empty.
    """
    df["gerry_violated_group"] = s_id_gerryfair
    df["i"] = list(range(0, len(df)))
    in_group = (df[group_ind] == 1)

    bias_mask = in_group
    if len(s_bias.keys()) != 0:
        bias_mask = df[s_bias.keys()].isin(s_bias).all(axis=1) & in_group
    bias_ids = set(df.loc[bias_mask, "i"])

    flagged_mask = (df["gerry_violated_group"] == 1) & in_group
    found_ids = set(df.loc[flagged_mask, "i"])

    shared = bias_ids & found_ids
    combined = bias_ids | found_ids
    return float(len(shared)) / float(len(combined))

def compute_precision_gerryfair(df, s_id_gerryfair, s_bias, group_ind):
    """Return the share of the flagged rows that were actually biased.

    The biased subgroup is the set of rows with ``df[group_ind] == 1`` that
    match every entry of ``s_bias`` (all such rows when ``s_bias`` is
    empty). The flagged subgroup is the set of rows with
    ``df[group_ind] == 1`` whose ``s_id_gerryfair`` entry equals 1. The
    columns ``"gerry_violated_group"`` and ``"i"`` are written to ``df``.

    The earlier implementation returned ``len(biased) / len(union)``, which
    is not precision; the result is now ``len(intersection) / len(flagged)``.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in ``s_bias``.
        s_id_gerryfair: Per-row flags, stored as ``"gerry_violated_group"``.
        s_bias: Dict describing the subgroup that was biased.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        The precision as a float, or ``0.0`` when no row is flagged
        (nothing flagged means nothing flagged correctly).
    """
    df["gerry_violated_group"] = s_id_gerryfair
    df["i"] = list(range(0, len(df)))
    in_group = (df[group_ind] == 1)

    if len(s_bias) == 0:
        bias_mask = in_group
    else:
        bias_mask = df[list(s_bias)].isin(s_bias).all(axis=1) & in_group
    bias_ids = set(df.loc[bias_mask, "i"])

    found_ids = set(df.loc[(df["gerry_violated_group"] == 1) & in_group, "i"])
    if not found_ids:
        # Avoid dividing by zero when no row was flagged.
        return 0.0
    return float(len(bias_ids & found_ids)) / float(len(found_ids))

def compute_recall_gerryfair(df, s_id_gerryfair, s_bias, group_ind):
    """Return the share of the biased subgroup covered by the flagged rows.

    The biased subgroup is the set of rows with ``df[group_ind] == 1`` that
    match every entry of ``s_bias`` (all such rows when ``s_bias`` is
    empty). The flagged subgroup is the set of rows with
    ``df[group_ind] == 1`` whose ``s_id_gerryfair`` entry equals 1. The
    columns ``"gerry_violated_group"`` and ``"i"`` are written to ``df``.

    Args:
        df: DataFrame with ``group_ind`` and the columns named in ``s_bias``.
        s_id_gerryfair: Per-row flags, stored as ``"gerry_violated_group"``.
        s_bias: Dict describing the subgroup that was biased.
        group_ind: Name of the 0/1 indicator column.

    Returns:
        ``len(intersection) / len(biased subgroup)`` as a float.

    Raises:
        ZeroDivisionError: If the biased subgroup has no rows.
    """
    df["gerry_violated_group"] = s_id_gerryfair
    df["i"] = list(range(0, len(df)))
    protected = (df[group_ind] == 1)

    if len(s_bias.keys()) == 0:
        bias_rows = df.loc[protected, "i"]
    else:
        matches = df[s_bias.keys()].isin(s_bias).all(axis=1)
        bias_rows = df.loc[matches & protected, "i"]
    flagged_rows = df.loc[((df["gerry_violated_group"] == 1) & protected), "i"]

    bias_ids = set(bias_rows)
    overlap = bias_ids.intersection(set(flagged_rows))
    return float(len(overlap)) / float(len(bias_ids))



def create_attributes_data_one_protected(df, label_col, group_ind):
    """Build a one-row frame flagging only ``group_ind`` as 1 and the label as 2.

    Every column of ``df`` starts at 0; ``group_ind`` is then set to 1 and
    ``label_col`` to 2 (either is added if ``df`` does not have it).

    Args:
        df: DataFrame whose column names become the columns of the result.
        label_col: Name of the label column.
        group_ind: Name of the single column flagged with 1.

    Returns:
        A single-row DataFrame of flags.
    """
    flags = {col_name: 0 for col_name in df.columns}
    flags[group_ind] = 1
    flags[label_col] = 2
    return pd.DataFrame([flags])

def pick_protected_class_bias_subset(df, cols, min_rows,num_affected, pr_subset, outcome_col_name,  outcome_var,_Delta, sigma_true = 0):
    """Resample a protected group and bias subgroup until both sides are large enough.

    Each attempt picks a random (column, value) pair as the protected group,
    replaces that column with a 0/1 indicator, simulates log-odds and
    outcomes, and draws a candidate subgroup. The attempt is accepted when
    the subgroup contains at least ``min_rows`` rows with
    ``outcome_col_name == outcome_var`` both inside and outside the
    protected group. The loop repeats until an attempt is accepted.

    The earlier version called ``produce_true_log_odds`` without its
    required ``sigma_true`` argument and unpacked two of its three return
    values, so it could not run; ``sigma_true`` is now an optional parameter.

    Args:
        df: Source DataFrame; it is copied on every attempt.
        cols: List of candidate column names; it is copied on every attempt.
        min_rows: Minimum subgroup size required on each side.
        num_affected: Number of covariates defining the subgroup.
        pr_subset: Inclusion probability for each covariate value.
        outcome_col_name: Column used to filter rows when counting.
        outcome_var: Value of ``outcome_col_name`` that is counted.
        _Delta: Log-odds shift passed to ``produce_true_log_odds``.
        sigma_true: Standard deviation of the per-row noise; 0 adds no noise.

    Returns:
        ``(df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c,
        log_odds, map_coefficients)``.
    """
    while True:
        df_t = df.copy(deep = True)
        cols_t = copy.deepcopy(cols)
        key, key_value = pick_random_protected_group(df_t, cols_t)
        group_ind = key + "_" + str(key_value)

        df_t[group_ind] = 0
        df_t.loc[df_t[key] == key_value, group_ind] = 1
        del df_t[key]
        cols_t.remove(key)

        # A fresh dict is passed so coefficients are redrawn on every attempt.
        log_odds, map_coefficients, _noise = produce_true_log_odds(df_t, cols_t, group_ind, _Delta, sigma_true, map_coefficients = {})
        df_t["true_log_odds"] = log_odds
        log_odds_to_prob(df_t, "true_log_odds", "true_probs")
        df_t["ReoffendedWithinTwoYears"] = df_t["true_probs"].apply(singular_row_bernoulli_draw)

        s_bias = select_subgroup_for_bias(num_affected, pr_subset, df_t, cols_t, group_ind)

        to_choose = df_t[s_bias.keys()].isin(s_bias).all(axis=1)
        has_outcome = (df_t[outcome_col_name] == outcome_var)

        len_p = len(df_t.loc[(to_choose & (df_t[group_ind] == 1) & has_outcome)])
        len_c = len(df_t.loc[(to_choose & (df_t[group_ind] == 0) & has_outcome)])

        if len_p >= min_rows and len_c >= min_rows:
            return df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c, log_odds, map_coefficients

def pick_protected_class_bias_subset_no_filter(df, cols, min_rows,num_affected, pr_subset, _Delta, sigma_true, low = -1, high = 1):
    """Resample a protected group and bias subgroup until both sides are large enough.

    Each attempt picks a random (column, value) pair as the protected group,
    adds a 0/1 indicator column for it, simulates log-odds (coefficients
    drawn between ``low`` and ``high``) and outcomes, and draws a candidate
    subgroup. The attempt is accepted when the subgroup has at least
    ``min_rows`` rows both inside and outside the protected group. The loop
    repeats until an attempt is accepted.

    Args:
        df: Source DataFrame; it is copied on every attempt.
        cols: List of candidate column names; it is copied on every attempt.
        min_rows: Minimum subgroup size required on each side.
        num_affected: Number of covariates defining the subgroup.
        pr_subset: Inclusion probability for each covariate value.
        _Delta: Log-odds shift passed to ``produce_true_log_odds``.
        sigma_true: Standard deviation of the per-row noise.
        low: Lower bound for the drawn coefficients.
        high: Upper bound for the drawn coefficients.

    Returns:
        ``(df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c,
        log_odds, map_coefficients)``.
    """
    while True:
        df_t = df.copy(deep = True)
        cols_t = copy.deepcopy(cols)
        key, key_value = pick_random_protected_group(df_t, cols_t)
        group_ind = key + "_" + str(key_value)

        df_t[group_ind] = 0
        df_t.loc[df_t[key] == key_value, group_ind] = 1
        cols_t.remove(key)

        simulated = produce_true_log_odds(df_t, cols_t, group_ind, _Delta, sigma_true, map_coefficients = {}, low = low, high = high)
        log_odds, map_coefficients, noise = simulated
        df_t["true_log_odds"] = log_odds
        df_t["rowwise_noise"] = noise
        log_odds_to_prob(df_t, "true_log_odds", "true_probs")
        df_t["ReoffendedWithinTwoYears"] = df_t["true_probs"].apply(singular_row_bernoulli_draw)

        s_bias = select_subgroup_for_bias(num_affected, pr_subset, df_t, cols_t, group_ind)

        to_choose = df_t[s_bias.keys()].isin(s_bias).all(axis=1)
        print("test if "+ str(s_bias) + " is a big enough subset to inject bias into")

        len_p = len(df_t.loc[(to_choose & (df_t[group_ind] == 1))])
        len_c = len(df_t.loc[(to_choose & (df_t[group_ind] == 0))])

        # Accept only when both sides of the subgroup are large enough.
        if len_p >= min_rows and len_c >= min_rows:
            return df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c, log_odds, map_coefficients
        

def pick_protected_class_bias_subset_no_filter_gaussian(df, cols, min_rows,num_affected, pr_subset, _Delta, sigma_true, sigma_coef):
    """Resample a protected group and bias subgroup, using Gaussian coefficients.

    Each attempt picks a random (column, value) pair as the protected group,
    adds a 0/1 indicator column for it, simulates log-odds with
    ``produce_true_log_odds_gaussian`` and outcomes, and draws a candidate
    subgroup. The attempt is accepted when the subgroup has at least
    ``min_rows`` rows both inside and outside the protected group. The loop
    repeats until an attempt is accepted.

    Args:
        df: Source DataFrame; it is copied on every attempt.
        cols: List of candidate column names; it is copied on every attempt.
        min_rows: Minimum subgroup size required on each side.
        num_affected: Number of covariates defining the subgroup.
        pr_subset: Inclusion probability for each covariate value.
        _Delta: Log-odds shift passed to the simulator.
        sigma_true: Standard deviation of the per-row noise.
        sigma_coef: Standard deviation used when drawing coefficients.

    Returns:
        ``(df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c,
        log_odds, map_coefficients)``.
    """
    while True:
        df_t = df.copy(deep = True)
        cols_t = copy.deepcopy(cols)
        key, key_value = pick_random_protected_group(df_t, cols_t)
        group_ind = key + "_" + str(key_value)

        df_t[group_ind] = 0
        df_t.loc[df_t[key] == key_value, group_ind] = 1
        cols_t.remove(key)

        simulated = produce_true_log_odds_gaussian(df_t, cols_t, group_ind, _Delta, sigma_true, map_coefficients = {}, sigma_coef = sigma_coef)
        log_odds, map_coefficients, noise = simulated
        df_t["true_log_odds"] = log_odds
        df_t["rowwise_noise"] = noise
        log_odds_to_prob(df_t, "true_log_odds", "true_probs")
        df_t["ReoffendedWithinTwoYears"] = df_t["true_probs"].apply(singular_row_bernoulli_draw)

        s_bias = select_subgroup_for_bias(num_affected, pr_subset, df_t, cols_t, group_ind)

        to_choose = df_t[s_bias.keys()].isin(s_bias).all(axis=1)
        print("test if "+ str(s_bias) + " is a big enough subset to inject bias into")

        len_p = len(df_t.loc[(to_choose & (df_t[group_ind] == 1))])
        len_c = len(df_t.loc[(to_choose & (df_t[group_ind] == 0))])

        # Accept only when both sides of the subgroup are large enough.
        if len_p >= min_rows and len_c >= min_rows:
            return df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c, log_odds, map_coefficients

def randomization_test(df, cols, sigma_true, sigma_coef, key, key_value):
    """Simulate log-odds and outcomes for a given protected group with no shift.

    An indicator column for ``key == key_value`` is added to a copy of
    ``df``, ``key`` is removed from the copied column list, and log-odds are
    produced by ``produce_true_log_odds_gaussian`` with a shift of 0.
    Probabilities and outcomes are then derived from those log-odds.

    Args:
        df: Source DataFrame; it is copied, not modified.
        cols: List of column names that includes ``key``; it is copied.
        sigma_true: Standard deviation of the per-row noise.
        sigma_coef: Standard deviation used when drawing coefficients.
        key: Column that defines the protected group.
        key_value: Value of ``key`` that marks group membership.

    Returns:
        ``(df_t, cols_t, key, key_value, group_ind, log_odds,
        map_coefficients)``.
    """
    group_ind = key + "_" + str(key_value)
    df_t = df.copy(deep = True)
    cols_t = copy.deepcopy(cols)

    is_member = (df_t[key] == key_value)
    df_t[group_ind] = 0
    df_t.loc[is_member, group_ind] = 1
    cols_t.remove(key)

    simulated = produce_true_log_odds_gaussian(df_t, cols_t, group_ind, 0, sigma_true, map_coefficients = {}, sigma_coef = sigma_coef)
    log_odds, map_coefficients, noise = simulated
    df_t["true_log_odds"] = log_odds
    df_t["rowwise_noise"] = noise
    log_odds_to_prob(df_t, "true_log_odds", "true_probs")
    df_t["ReoffendedWithinTwoYears"] = df_t["true_probs"].apply(singular_row_bernoulli_draw)

    return df_t, cols_t, key, key_value, group_ind, log_odds, map_coefficients

def pick_protected_class_bias_subset_no_filter_protected_class_given(df, key, key_value, cols, min_rows,num_affected, pr_subset, _Delta, sigma_true, map_coefficients):
    """Resample a bias subgroup for a fixed protected group until it is large enough.

    Each attempt adds a 0/1 indicator column for ``key == key_value`` to a
    copy of ``df``, simulates log-odds with ``produce_true_log_odds`` using
    ``map_coefficients``, and derives probabilities and outcomes. When
    ``num_affected`` or ``pr_subset`` is 0 the first attempt is returned
    with an empty subgroup. Otherwise a candidate subgroup is drawn and
    accepted once it has at least ``min_rows`` rows both inside and outside
    the protected group.

    Args:
        df: Source DataFrame; it is copied on every attempt.
        key: Column that defines the protected group.
        key_value: Value of ``key`` that marks group membership.
        cols: List of column names that includes ``key``; it is copied.
        min_rows: Minimum subgroup size required on each side.
        num_affected: Number of covariates defining the subgroup.
        pr_subset: Inclusion probability for each covariate value.
        _Delta: Log-odds shift passed to ``produce_true_log_odds``.
        sigma_true: Standard deviation of the per-row noise.
        map_coefficients: Coefficient mapping handed to the simulator.

    Returns:
        ``(df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c,
        log_odds, map_coefficients)``; ``s_bias`` is ``{}`` and both counts
        are 0 when no subgroup was requested.
    """
    group_ind = key + "_" + str(key_value)
    no_subgroup_requested = ((num_affected == 0) or (pr_subset == 0))
    while True:
        df_t = df.copy(deep = True)
        cols_t = copy.deepcopy(cols)

        df_t[group_ind] = 0
        df_t.loc[df_t[key] == key_value, group_ind] = 1
        cols_t.remove(key)

        simulated = produce_true_log_odds(df_t, cols_t, group_ind, _Delta, sigma_true, map_coefficients)
        log_odds, map_coefficients, noise = simulated
        df_t["true_log_odds"] = log_odds
        df_t["rowwise_noise"] = noise
        log_odds_to_prob(df_t, "true_log_odds", "true_probs")
        df_t["ReoffendedWithinTwoYears"] = df_t["true_probs"].apply(singular_row_bernoulli_draw)

        if no_subgroup_requested:
            return df_t, cols_t, key, key_value, group_ind, {}, 0, 0, log_odds, map_coefficients

        s_bias = select_subgroup_for_bias(num_affected, pr_subset, df_t, cols_t, group_ind)

        to_choose = df_t[s_bias.keys()].isin(s_bias).all(axis=1)
        print("test if "+ str(s_bias) + " is a big enough subset to inject bias into")

        len_p = len(df_t.loc[(to_choose & (df_t[group_ind] == 1))])
        len_c = len(df_t.loc[(to_choose & (df_t[group_ind] == 0))])

        # Accept only when both sides of the subgroup are large enough.
        if len_p >= min_rows and len_c >= min_rows:
            return df_t, cols_t, key, key_value, group_ind, s_bias, len_p, len_c, log_odds, map_coefficients

def shuffle_values(df, cols):
    """Independently permute the values of each listed column in place.

    Args:
        df: DataFrame to modify.
        cols: Names of the columns whose values are shuffled.

    Returns:
        The same ``df`` object, after shuffling.
    """
    for column in cols:
        values = list(df[column])
        random.shuffle(values)
        df[column] = values
    return df

    


